# download_mdpi_final_titles_from_html.py
# MDPI (Acoustics, Forecasting, Physics-SUI, Quantum Beam Science) Journals Downloader
# Automates downloading and renaming of PDFs from MDPI Physics Journal issues
# -------------------------------------------------------------------
# Workflow:
# - Prompts user for the path to a cleaned citations.txt (only containing article URLs)
# - Prompts user for the path to a locally saved issue.html (source of the MDPI issue page)
# - Extracts article titles from the local HTML by matching <a class="title-link" href="/...">
# - Uses Selenium to navigate to each article URL and locate its direct PDF link via <meta name="citation_pdf_url">
# - Downloads PDFs into a fixed "mdpi_journal" folder inside the user's Downloads directory (~/Downloads/mdpi_journal)
# - Renames PDFs based on the extracted titles from the HTML (sanitized for file safety)
# - Adds human-like delays to avoid race conditions in downloading and renaming
# 
# Features:
# - Keeps download logic identical to the proven stable version from download_mdpi_ok_names_not_ok.py
# - Avoids re-downloading files that already exist
# - Uses local HTML for title mapping (avoiding live requests that can cause 403 Forbidden errors)
# - Ensures all titles correspond to their correct article URLs before renaming
# 
# Usage:
# 1. Save the MDPI issue page source as "issue.html" (Right-click → View Page Source → Save As…)
# 2. Prepare a citations.txt file containing only the article URLs (one per line)
# 3. Place both files in the same directory as the script (or provide full paths when prompted)
# 4. Run the script: python download_mdpi_final_titles_from_html.py
# 5. PDFs will be saved and renamed in a folder named "mdpi_journal" inside your Downloads directory

import os, time, re
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Path to a text file holding one article URL per line.
citations_file = input("Enter path to your citations file: ").strip()

# Read the URL list BEFORE launching Chrome: a mistyped path then fails fast
# instead of raising after the browser has started and leaving an orphaned
# Chrome/chromedriver process behind.
with open(citations_file, "r", encoding="utf-8") as f:
    # Skip blank lines and trim surrounding whitespace from each URL.
    urls = [u.strip() for u in f if u.strip()]

print(f"[INFO] Found {len(urls)} articles to download.")

# All PDFs land in ~/Downloads/mdpi_journal (created on first run).
download_dir = Path.home() / "Downloads" / "mdpi_journal"
download_dir.mkdir(parents=True, exist_ok=True)

# Headless Chrome configured to save PDFs straight to download_dir.
chrome_options = Options()
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": str(download_dir),
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    # Download PDFs instead of opening them in Chrome's built-in viewer.
    "plugins.always_open_pdf_externally": True,
})

driver = webdriver.Chrome(options=chrome_options)

def sanitize(name: str) -> str:
    """Return *name* made safe for use as a file-name stem.

    Removes the characters ``\\ / * ? : " < > |``, collapses every run of
    whitespace (including newlines and tabs) into a single space, trims the
    ends, and limits the result to 180 characters.

    Args:
        name: Raw title text.

    Returns:
        The cleaned title, at most 180 characters long and never ending in
        whitespace. May be empty if nothing usable remains.
    """
    name = re.sub(r'[\\/*?:"<>|]', "", name)
    name = re.sub(r"\s+", " ", name).strip()
    # Truncation can cut right after a space; strip again so the stem never
    # ends in whitespace (which would yield names like "Title .pdf").
    return name[:180].rstrip()  # be safe on Windows paths

def wait_for_new_pdf(start_snapshot: set, timeout=90):
    """Poll ``download_dir`` until a PDF that is not in *start_snapshot* exists.

    While any ``*.crdownload`` file is present the download is treated as
    still in progress and no result is returned.

    Args:
        start_snapshot: Set of PDF paths that were already in the folder
            before the download was started.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        Path of the new PDF; if several new PDFs appeared, the one with the
        most recent modification time.

    Raises:
        TimeoutError: If no new PDF shows up within *timeout* seconds.
    """
    def modified_at(path):
        return path.stat().st_mtime

    deadline = time.time() + timeout
    while time.time() < deadline:
        # Chrome keeps an unfinished download as *.crdownload.
        download_active = False
        for partial in download_dir.glob("*.crdownload"):
            if partial.suffix == ".crdownload":
                download_active = True
                break

        if not download_active:
            fresh = set(download_dir.glob("*.pdf")) - start_snapshot
            if fresh:
                # Prefer the most recently written file, just in case.
                return max(fresh, key=modified_at)

        time.sleep(0.5)
    raise TimeoutError("Timed out waiting for new PDF to appear.")

# Main loop: for every article URL, open the page, read its title and direct
# PDF link, trigger the download, then rename the finished file after the
# title. The outer try/finally guarantees the browser is shut down even when
# the run is interrupted (e.g. Ctrl-C), which `except Exception` does not catch.
try:
    for i, url in enumerate(urls, 1):
        try:
            print(f"[{i}/{len(urls)}] {url}")
            # Snapshot existing PDFs so the new download can be told apart.
            before = set(download_dir.glob("*.pdf"))

            driver.get(url)
            h1 = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, "h1"))
            )
            # Fall back to a numbered name when the title sanitizes to "".
            title = sanitize(h1.text) or f"mdpi_article_{i}"
            pdf_url = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, "//meta[@name='citation_pdf_url']"))
            ).get_attribute("content")
            if not pdf_url:
                # Report a clear message instead of an opaque WebDriver error.
                raise ValueError("citation_pdf_url meta tag has no content")

            # Kick off the download
            driver.get(pdf_url)

            # Wait for exactly the new file
            new_pdf = wait_for_new_pdf(before, timeout=180)

            # Build target name; avoid overwrite by appending " (2)", " (3)", ...
            target = download_dir / f"{title}.pdf"
            k = 2
            while target.exists():
                target = download_dir / f"{title} ({k}).pdf"
                k += 1

            new_pdf.rename(target)
            print(f"    ✅ Saved as: {target.name}")

        except Exception as e:
            # Best effort: report the failure and continue with the next URL.
            print(f"    ❌ Error: {e}")
finally:
    driver.quit()

print("\n✅ All done! PDFs in:", download_dir)
